/*******************************************************************************
  ARTICLE	GAY, GOBBI, GONI (2025) "REVOLUTIONARY TRANSITIONS. INHERITANCE    
            CHANGE AND FERTILITY DECLINE" JOURNAL OF POLITICAL ECONOMY         
                                                                               
  AUTHORS	VICTOR GAY, PAULA GOBBI, MARC GONI                                 
  CONTACT	victor.gay@tse-fr.eu; paula.eugenia.gobbi@ulb.be; marc.goni@uib.no 
  VERSION	1.0 (MAY 2025)                                                     
  SOFTWARE	STATA SE 18                                                        
  LICENCE	MIT                                                                
--------------------------------------------------------------------------------

GENI DATA PREPARATION DO FILE - INHERITANCE VARIABLES AND CONTROLS

This file combines the Geni sample and data on inheritance customs and controls.

Instructions: 
-------------
	Gain access to the Geni database from MyHeritage, Ltd and either:
	
	1) place the following files:

		geni_profiles.csv (20.2 GB, created on April 4, 2022, at 11:39:31),
		geni_unions.csv (7.2 GB, created on April 4, 2022, at 11:55:59),
		geni_union details.csv (2.6 GB, created on April 4, 2022, at 11:54:34).
	
	in folder /1_raw_data/1_1_henri/ (see README for more details) and run R-codes 
	named 1-* to 8-* in folder "/2_scripts/2_1_data/01_geni_data_to_sample"; or
	
	2) place the author-provided fr-clean.csv file into the folder 3_outputs/3_1_datasets.
 
	Open do-files from directory where they are placed; order matters; run whole code.

Do-file structure: 
------------------
	0. PROGRAM SETUP
	1. TREATMENT VARIABLES FOR GENI DATA
	2. CONTROL VARIABLES FOR GENI DATA
	3. MERGE ALL CONTROLS FOR GENI DATA

Main sources: 
-------------
	Geni database - online genealogies in geni.com, a MyHeritage Company
	Inheritance customs (own collection)
	Geolocated communes Henry (own collection)
		 
Other sources: 
--------------
	Wheat prices (Ridolfi 2019)
	Population density (Cristofoli et al., 2021; IGN, 2021)
	Administrative centers (Nordman, Ozouf-Marignier, and Laclau, 1989 pp. 74–80)
	Political societies (Boutier, Boutry, and Bonin, 1992, pp. 77–101)
	Cassini road (Perret, Gribaudi, and Barthelemy, 2015)
	Rebellions (Nicolas 2002; Gay 2025)
	Horse-post network (Albertus and Gay, 2025)
	Caloric suitability (Galor and Ozak, 2016)
	Ruggedness (Nunn and Puga, 2012)
	Soil texture (INRA, 1998)
	Refractory clergy in 1791 (Tacket 1984)
		 
*/
********************************************************************************

*/

********************
* 0. PROGRAM SETUP *
********************

version 18
clear all
set more off

************************
* PACKAGE DEPENDENCIES *
************************

ssc install fre

***************
* DIRECTORIES *
***************

global DATA 	"../../1_raw_data"
global SCRIPTS 	"../2_1_data"
global TEMP 	"../2_0_tempfiles"

/* see readme file for details on creation of csv files */

timer on 1 

****************************************
* 1. TREATMENT VARIABLES FOR GENI DATA *
****************************************

* TREATMENT STATUSES

** TREATED / UNTREATED MUNICIPALITIES

import delimited "$DATA/1_4_customs/affected.csv", clear
drop if missing(imp_exc)
keep insee_com1 imp_exc
generate  affected = 1 if imp_exc == 1
replace affected = 0 if imp_exc == 0
label define affected_lbl 0 "Not affected" 1 "Affected"
label values affected affected_lbl
label variable affected "Impartible or women excluded before 1793 reforms [Gay, Gobbi, Goñi 2023]"
drop imp_exc
save "$TEMP/affected", replace

** PARTIBLE / IMPARTIBLE MUNICIPALITIES

import delimited "$DATA/1_4_customs/partimpart.csv", clear
drop if missing(partibleim)
keep insee_com1 partibleim

generate partible = .
replace  partible = 1 if partibleim == "partible"
replace partible = 0 if partibleim == "impartible"
label define partible_lbl 0 "Impartible" 1 "Partible"
label values partible partible_lbl
label variable partible "Partible before 1793 reforms [Gay, Gobbi, Goñi 2023]"

generate impart = .
replace  impart = 1 if partibleim == "impartible"
replace impart = 0 if partibleim == "partible"
label define impart_lbl 0 "Partible" 1 "Impartible"
label values impart impart_lbl
label variable impart "Impartible before 1793 reforms [Gay, Gobbi, Goñi 2023]"
drop partibleim

save "$TEMP/partimpart", replace

** WOMEN INCLUDED / EXCLUDED MUNICIPALITIES

import delimited "$DATA/1_4_customs/fem_inherit.csv", clear
drop if missing(female_exc)
keep insee_com1 female_exc

generate fem_included = .
replace  fem_included = 1 if female_exc == "no"
replace  fem_included = 0 if female_exc == "yes"
label define fem_included_lbl 0 "Women excluded" 1 "Women included"
label values fem_included fem_included_lbl
label variable fem_included "Women included in inheritance before 1793 reforms [Gay, Gobbi, Goñi 2023]"

generate womenexc = .
replace  womenexc = 1 if female_exc == "yes"
replace  womenexc = 0 if female_exc == "no"
label define womenexc_lbl 0 "Women included" 1 "Women excluded"
label values womenexc womenexc_lbl
label variable womenexc "Women excluded from inheritance before 1793 reforms [Gay, Gobbi, Goñi 2023]"
drop female_exc

save "$TEMP/fem_inherit", replace

* DISTANCES TO TREATMENT FRONTIER

import delimited "$DATA/1_4_customs/affected_distance.csv", clear
keep insee_com1 hubdist
drop if missing(insee_com1)
rename hubdist dist_affected
label variable dist_affected "Distance to inheritance frontier (km) [Gay, Gobbi, Goñi 2023]"
save "$TEMP/dist_treatment", replace

* FRONTIER SEGMENTS

foreach i of numlist 50 100 {
	
	import delimited "$DATA/1_4_customs/affected_segment_`i'_length.csv", clear
	rename fid segment_affected_`i'
	label variable segment_affected_`i' "Nearest inheritance frontier segment id (`i'km)"
	rename length segment_affected_`i'_length
	label variable segment_affected_`i'_length "Nearest inheritance frontier segment length (`i'km)"
	save "$TEMP/segment_affected_`i'_length", replace

	import delimited "$DATA/1_4_customs/affected_segment_`i'.csv", clear
	keep insee_com1 hubname
	drop if missing(insee_com1)

	rename hubname segment_affected_`i'
	label variable segment_affected_`i' "Nearest inheritance frontier segment id (`i'km)"

	merge m:1 segment_affected_`i' using "$TEMP/segment_affected_`i'_length", ///
		assert(2 3) keep(3) nogenerate
		
	save "$TEMP/segment_affected_`i'", replace	
}

use "$TEMP/segment_affected_50", clear
merge 1:1 insee_com1 using "$TEMP/segment_affected_100", assert(3) nogenerate
erase "$TEMP/segment_affected_50.dta"
erase "$TEMP/segment_affected_100.dta"

save "$TEMP/affected_segment", replace

* MERGE TREATMENT VARIABLES

use "$TEMP/affected", clear
merge 1:1 insee_com1 using "$TEMP/partimpart", assert(3) nogenerate
merge 1:1 insee_com1 using "$TEMP/fem_inherit", assert(3) nogenerate
merge 1:1 insee_com1 using "$TEMP/dist_treatment", assert(2 3) nogenerate
merge 1:1 insee_com1 using "$TEMP/affected_segment", assert(2 3) nogenerate

compress
save "$TEMP/data_treatments", replace

erase "$TEMP/affected.dta"
erase "$TEMP/partimpart.dta"
erase "$TEMP/fem_inherit.dta"
erase "$TEMP/dist_treatment.dta"
erase "$TEMP/affected_segment.dta"


**************************************
* 2. CONTROL VARIABLES FOR GENI DATA *
**************************************

* MUNICIPALITY COORDINATES

import delimited "$DATA/1_3_france/coordinates_wgs84.csv", clear
keep latitude longitude insee_com1 x y nom_comm
drop if insee_com1 == .
label variable latitude "Latitude (WGS 84) [GEOFLA 2011]"
label variable longitude "Longitude (WGS 84) [GEOFLA 2011]"
rename x_cheflieu longitude_rgf93
rename y_cheflieu latitude_rgf93
label variable latitude_rgf93 "Latitude (RGF 93)  [GEOFLA 2011]"
label variable longitude_rgf93 "Longitude (RGF 93)  [GEOFLA 2011]"
replace latitude_rgf93 = latitude_rgf93 / 1000 /* convert to km */
replace longitude_rgf93 = longitude_rgf93 / 1000 /* convert to km */
save "$TEMP/coordinates", replace

* BAILLIAGE IDENTIFIERS

import delimited "$DATA/1_5_bailliages/bailliages.csv", clear
keep insee_com1 bai bai_nam
drop if missing(insee_com1)
rename bai bailliage_id 
label variable bailliage_id "Bailliage identifier [Gay, Gobbi, Goñi 2023]"
rename bai_nam bailliage_name 
label variable bailliage_name "Bailliage name [Gay, Gobbi, Goñi 2023]"
save "$TEMP/bailliage", replace

* WHEAT PRICES

** FORMAT WHEAT PRICES DATA

*** PRICE LOCATIONS

/* set of price locations */
import excel "$DATA/1_6_wheat_prices/wheat_prices_ridolfi.xlsx",             ///
	sheet("locations") firstrow clear
drop if country != "France"
drop country
save "$TEMP/wheat_prices_locations", replace

/* cities-decade grid */
keep insee_com
duplicates drop
expand 376
sort insee_com
by insee_com, sort: generate year = _n + 1499

generate decade = ., before(year)
forvalues y = 1700 (10) 1790 {
	local j = `y' + 9
	replace decade = `y' if year >= `y' & year <= `j'
}

keep if !missing(decade)

save "$TEMP/wheat_prices_grid", replace

*** FORMAT WHEAT PRICES

import excel "$DATA/1_6_wheat_prices/wheat_prices_ridolfi.xlsx",             ///
	sheet("Wheat") firstrow clear
keep place year sousliter source
rename place com_nom
rename sousliter price

/* drop observations with missing prices */
keep if !missing(price)

/* drop out of France (or France average) and merge with locations */
drop if com_nom == "France"
drop if com_nom == "Bale"
drop if com_nom == "Mons"
drop if com_nom == "Stura"
drop if com_nom == "Ath"
merge m:1 com_nom using "$TEMP/wheat_prices_locations", assert(2 3) keep(3)  ///
	nogenerate
erase "$TEMP/wheat_prices_locations.dta"
order year com_nom insee_com location_type region_capital, first

/* timing: 1700-1800 */
keep if (year >= 1700 & year < 1800)

/* summary statistics for the type of location */
fre location_type
drop location_type region_capital

/* number of different locations */
codebook insee_com

/* number of sources */
codebook source
drop source

/* match to grid */
merge m:1 year insee_com using "$TEMP/wheat_prices_grid", assert(2 3) nogenerate
erase "$TEMP/wheat_prices_grid.dta"
sort insee_com year
egen values = count(price), by(insee_com)
drop if values == 0 /* drop locations with no price quotes at all */
summarize values
drop values

/* average per location-decade (missing if no price quote in location-decade) */
generate obs = (!missing(price))
collapse (mean) price (sum) obs, by(decade insee_com)
summarize obs
fre obs

/* reshape dataset */
reshape wide price obs, i(insee_com) j(decade)

/* add coordinates */
replace insee_com = "69381" if insee_com == "69123" /* Lyon */
replace insee_com = "75101" if insee_com == "75056" /* Paris */

merge 1:1 insee_com using "$DATA/1_3_france/coordinates_rgf93", assert(2 3)  ///
	keep(3) nogen
order x y, after(insee_com)

/* export to QGIS to calculate spatial interpolation */
forvalues y = 1700 (10) 1790 {
	preserve
	keep insee_com x y price`y' obs`y'
	drop if obs`y' == 0
	export delimited using "$DATA/1_6_wheat_prices/wheat_prices_`y'.csv", replace
	restore
}

/* import interpolated prices from QGIS (see instructions; update to merge all years) */
import delimited "$DATA/1_6_wheat_prices/wheat_prices.csv", clear
keep insee_com pw1700mean-pw1790mean
forvalues y = 1700 (10) 1790 {
	local j = `y' + 9
	rename pw`y'mean pw`y'
	label variable pw`y' "Average wheat price in `y'-`j' (sous per liter) [Ridolfi 2019]"
}
save "$TEMP/wheat_prices", replace

* POPULATION AND POPULATION DENSITY IN 1793 AND 1800

/* load GEOFLA attribute table */
import excel "$DATA/1_7_population/ID_GEOFLA.xlsx", sheet("GEOFLA_2011") firstrow clear
keep INSEE_COM ID_GEOFLA
replace INSEE_COM = subinstr(INSEE_COM,"2B","20",1)
replace INSEE_COM = subinstr(INSEE_COM,"2A","20",1)
destring INSEE_COM, replace
rename INSEE_COM insee_com
duplicates report insee_com
assert r(N) == r(unique_value)
save "$TEMP/id_geofla1", replace

/* area */
import delimited "$DATA/1_7_population/geofla_area.csv", clear case(preserve)
rename SUPERFICIE area
save "$TEMP/geofla_area", replace

use "$TEMP/id_geofla1", clear
merge 1:1 ID_GEOFLA using "$TEMP/geofla_area", assert(3) nogenerate
drop ID_GEOFLA
replace area = area / 100
label variable area "Area (km2) [GEOFLA 2011]"
save "$TEMP/id_geofla", replace
erase "$TEMP/id_geofla1.dta"
erase "$TEMP/geofla_area.dta"

/* duplicate merged communes between 2015 and 2021 needed from the profile_id communes using table de passage */
import excel "$DATA/1_7_population/table_passage_annuelle_2021.xlsx", sheet("Liste des fusions") cellrange(A1:E2589) clear
drop in 1/6
rename C insee_com1
rename B insee_com
destring insee_com1 insee_com, replace
keep insee_com1 insee_com
by insee_com1, sort: egen rank = rank(insee_com)
keep if rank == 1
drop rank
merge 1:1 insee_com1 using "$DATA/1_7_population/missing_insee_com1_cassini", keep(3) nogenerate
save "$TEMP/toduplicate_insee_com1_cassini", replace

/* load population data */
use "$DATA/1_7_population/cassini_base", clear
keep pop_an3_val pop_an8_val com_2021
rename com_2021 insee_com
replace insee_com = subinstr(insee_com,"2B","20",1)
replace insee_com = subinstr(insee_com,"2A","20",1)
destring insee_com, replace
collapse (sum) pop_an3_val pop_an8_val, by(insee_com)

/* duplicates PLM arm */
expand 2 if insee_com == 13055
by insee_com, sort: generate dup = _n
replace insee_com = 13211 if dup == 2
drop dup
expand 3 if insee_com == 69123
by insee_com, sort: generate dup = _n
replace insee_com = 69383 if dup == 2
replace insee_com = 69389 if dup == 3
drop dup
expand 5 if insee_com == 75056
by insee_com, sort: generate dup = _n
replace insee_com = 75104 if dup == 2
replace insee_com = 75115 if dup == 3
replace insee_com = 75117 if dup == 4
replace insee_com = 75120 if dup == 5
drop dup

/* other mergers */
expand 3 if insee_com == 49080
by insee_com, sort: generate dup = _n
replace insee_com = 49065 if dup == 2
replace insee_com = 49096 if dup == 3
drop dup
expand 2 if insee_com == 49261
by insee_com, sort: generate dup = _n
replace insee_com = 49279 if dup == 2
drop dup
expand 2 if insee_com == 49261
by insee_com, sort: generate dup = _n
replace insee_com = 49346 if dup == 2
drop dup

/* duplicate merged communes between 2015 and 2021 */
merge 1:m insee_com using "$TEMP/toduplicate_insee_com1_cassini", keep(1 3)
expand 2 if _merge == 3
by insee_com insee_com1, sort: generate dup = _n
replace insee_com = insee_com1 if dup != 1
drop insee_com1 _merge dup
duplicates drop
replace pop_an3_val = . if pop_an3_val == 0
replace pop_an8_val = . if pop_an8_val == 0
duplicates report insee_com
assert r(N) == r(unique_value)
rename pop_an3 pop_1793
rename pop_an8 pop_1800

/* match population data */
merge 1:1 insee_com using "$TEMP/id_geofla", keep(3) nogenerate
erase "$TEMP/id_geofla.dta"
erase "$TEMP/toduplicate_insee_com1_cassini.dta"
rename insee_com insee_com1

replace pop_1793 = pop_1800 if missing(pop_1793) /* 7k ids over 189k */
replace pop_1800 = pop_1793 if missing(pop_1800) /* 493 ids over 189k */

label variable pop_1793 "Population in 1793 [Cristofoli et al. 2021]"
label variable pop_1800 "Population in 1800 [Cristofoli et al. 2021]"

generate pop_density_1793 = pop_1793 / area
generate pop_density_1800 = pop_1800 / area

label variable pop_density_1793 "Population density in 1793 (per km2) [Cristofoli et al. 2021, GEOFLA 2011]"
label variable pop_density_1800 "Population density in 1800 (per km2) [Cristofoli et al. 2021, GEOFLA 2011]"

/* use mean for the few missings */
foreach v of varlist pop_1793 pop_1800 pop_density_1793 pop_density_1800 {
	egen `v'_mean = mean(`v')
	replace `v' = `v'_mean if missing(`v')
	drop `v'_mean
}

save "$TEMP/cassini_population", replace

** ARRONDISSEMENTS CHEF LIEUX

import delimited "$DATA/1_7_population/arrondissements_cl_1837.csv", clear
generate rural_henry = 1
replace  rural_henry = 0 if urban_henry == 1
drop nom urban
label variable rural_henry "Rural municpality per Henry criteria [SGF 1837]"
save "$TEMP/rural_henry", replace

* ADMINISTRATIVE CENTERS

** EVECHES

import delimited "$DATA/1_8_admin_centers/eveches_distance.csv", varnames(1) clear
drop nom_comm x_cheflieu y_cheflieu hubname
rename hubdist dist_eveche
label variable dist_eveche "Distance to nearest évêché capital (km) [Nordman et al. 1989]"
drop if insee_com1 == .
save "$TEMP/eveches", replace

** RECETTES

import delimited "$DATA/1_8_admin_centers/recettes_distance.csv", varnames(1) clear
drop nom_comm x_cheflieu y_cheflieu hubname
rename hubdist dist_recette
label variable dist_recette "Distance to nearest recette des finances capital (km) [Nordman et al. 1989]"
drop if insee_com1 == .
save "$TEMP/recettes", replace

** BAILLIAGES

import delimited "$DATA/1_8_admin_centers/bailliages_distance.csv", varnames(1) clear
drop nom_comm x_cheflieu y_cheflieu hubname
rename hubdist dist_bailliage
label variable dist_bailliage "Distance to nearest bailliage capital (km) [Nordman et al. 1989]"
drop if insee_com1 == .
save "$TEMP/bailliages", replace

** SUBDELEGATIONS

import delimited "$DATA/1_8_admin_centers/subdelegations_distance.csv", varnames(1) clear
drop nom_comm x_cheflieu y_cheflieu hubname
rename hubdist dist_subdeleg
label variable dist_subdeleg "Distance to nearest subdélégation capital (km) [Nordman et al. 1989]"
drop if insee_com1 == .
save "$TEMP/subdelegations", replace

* POLITICAL SOCIETIES

import delimited "$DATA/1_9_political_societies/political_societies_distance.csv", varnames(1) clear
drop nom_comm x_cheflieu y_cheflieu hubname
rename hubdist dist_socpol
label variable dist_socpol "Distance to closest political society (km)"
drop if insee_com1 == .
label variable dist_socpol 		"Distance to nearest political society (km) [Boutier et al. 1992]"
save "$TEMP/political_societies", replace

* REBELLIONS AGAINST THE STATE IN 1779-1789

import delimited "$DATA/1_10_rebellions/rebellions_distance.csv", varnames(1) clear
drop nom_comm x_cheflieu y_cheflieu hubname
rename hubdist dist_rebellion
label variable dist_rebellion "Distance to nearest rebellion against the state in 1780s (km) [Gay 2025]"
drop if insee_com1 == .
save "$TEMP/rebellions", replace

* CASSINI ROADS

import delimited "$DATA/1_11_roads/roads_distance.csv", varnames(1) clear
drop nom_comm x_cheflieu y_cheflieu hubname
rename hubdist dist_cassini
drop if insee_com1 == .
label variable dist_cassini "Distance to nearest Cassini road in 1750-90 (km) [Perret et al. 2015]"
save "$TEMP/roads", replace

* HORSE-POST

import delimited "$DATA/1_12_horse_post/horse_post_distance.csv", varnames(1) clear
drop nom_comm x_cheflieu y_cheflieu hubname
rename hubdist dist_post
drop if insee_com1 == .
label variable dist_post "Distance to nearest horse-post relay in 1790 (km) [Albertus and Gay 2025]"
save "$TEMP/horse_post", replace

* CLERICAL PERIL

import delimited "$DATA/1_13_clerical_peril/clerical_peril.csv", varnames(1) clear
drop nom_comm x_cheflieu y_cheflieu cl_peril
drop if insee_com1 == .
label variable cl_peril_i 		"Share refractory clergy in 1791 (%), inputed from département if missing [Tackett 1984]"
save "$TEMP/clerical_peril", replace

* SOIL TEXTURE

** COMMUNE LEVEL

import dbase using "$DATA/1_14_soil_texture/SOIL_TEXTURE/COMMUNE_TEXTURE_INTERSECTED.dbf", clear case(lower)
order texture, last

/* weights */
by insee_com, sort: egen area_com = total(area)
generate area_sh = area / area_com

preserve
collapse (sum) area_sh, by(insee_com)
assert area_sh > 0.9999
restore

/* area of sandy soil */
generate sandy = 0
replace  sandy = 1 if texture < 1.5 /* definition */
replace sandy = .a if texture == .
generate area_sandy = area * sandy
drop sandy

/* calculate weighted averages */
generate mean_texture = texture * area_sh
collapse (sum) mean_texture area_sandy, by(insee_com id_geofla code_comm area_com)
replace mean_texture = . if mean_texture == 0
rename mean_texture texture

/* share of sandy soil */
generate sh_sandy = area_sandy / area_com
replace sh_sandy = . if texture == .
drop area*

label variable texture "Soil texture (weighted average) [INRA 1998]"

/* indicator for sandy */
generate sandy = 0
replace  sandy = 1 if texture < 1.5 /* definition */
replace sandy = .a if texture == .
label define sandy_lbl 0 "Not sandy" 1 "Sandy" .a "Missing texture"
label values sandy sandy_lbl
label variable sandy "Soil texture = sandy (indicator variable) [INRA 1998]"
label variable sh_sandy "Share of sandy soils [INRA 1998]"

drop id code
save "$TEMP/sandy", replace

** AVERAGE SOIL TEXTURE PER BUFFER 30KM AROUND TREATMENT FRONTIER

import dbase using "$DATA/1_14_soil_texture/SOIL_TEXTURE/BUFFER_TEXTURE_INTERSECTED.dbf", clear case(lower)
order texture, last

/* weights */
by fid, sort: egen area_fid = total(area)
generate area_sh = area / area_fid

preserve
collapse (sum) area_sh, by(fid)
assert area_sh > 0.9999
restore

/* area of sandy soil */
generate sandy = 0
replace  sandy = 1 if texture < 1.5 /* definition */
replace sandy = .a if texture == .
generate area_sandy = area * sandy
drop sandy

/* calculate weighted averages */
generate mean_texture = texture * area_sh
collapse (sum) mean_texture area_sandy, by(fid area_fid)
replace mean_texture = . if mean_texture == 0
rename mean_texture texture

/* share of sandy soil */
generate sh_sandy = area_sandy / area_fid
replace sh_sandy = . if texture == .
drop area*
label variable sh_sandy "Share of sandy soils [INRA 1998]"

rename sh_sandy sh_sandy_buffer
rename texture texture_buffer
rename fid segment_affected_50

save "$TEMP/sandy_buffer", replace

* RUGGEDNESS

** COMMUNE LEVEL

import dbase using "$DATA/1_15_ruggedness/RUGGEDNESS/RUGGEDNESS_COMMUNE_INTERSECTED.dbf", clear case(lower)

/* weights */
by insee_com, sort: egen area_com = total(area)
generate area_sh = area / area_com

preserve
collapse (sum) area_sh, by(insee_com)
assert area_sh > 0.9999
restore

/* calculate weighted averages */
generate mean_ruggedness = rugg * area_sh
collapse (sum) mean_ruggedness, by(insee_com id_geofla code_comm area_com)
replace mean_ruggedness = . if mean_ruggedness == 0
rename mean_ruggedness ruggedness

drop id_geofla code_comm area
label variable ruggedness "Ruggedness (100 meters) [Nunn Puga 2012]"

save "$TEMP/ruggedness", replace

** AVERAGE RUGGEDNESS PER BUFFER 30KM AROUND TREATMENT FRONTIER

import dbase using "$DATA/1_15_ruggedness/RUGGEDNESS/RUGGEDNESS_BUFFER_INTERSECTED.dbf", clear case(lower)

/* weights */
by fid, sort: egen area_buffer = total(area)
generate area_sh = area / area_buffer

preserve
collapse (sum) area_sh, by(fid)
assert area_sh > 0.9999
restore

/* calculate weighted averages by buffer zone */
generate mean_ruggedness = rugg * area_sh
collapse (sum) mean_ruggedness, by(fid)
rename mean_ruggedness ruggedness_buffer
label variable ruggedness_buffer "Average ruggedness in buffer 30km around frontier"
rename fid segment_affected_50
save "$TEMP/ruggedness_buffer", replace

* CALORIC SUITABILITY

import dbase using "$DATA/1_16_caloric/CALORIC_SUITABILITY/post1500AverageCalories_COMMUNE.dbf", clear case(lower)
keep insee_com avcalmean
label variable avcalmean "Average caloric suitability [Galor Ozäk 2016]"
save "$TEMP/caloric", replace

***************************************
* 3. MERGE ALL CONTROLS FOR GENI DATA *
***************************************

use "$DATA/1_2_geni/auxiliary_files/fr-clean-geoloc", clear
rename insee_com insee_com1

merge m:1 insee_com1 using "$TEMP/political_societies", assert(2 3) keep(3) nogenerate
merge m:1 insee_com1 using "$TEMP/eveches", assert(2 3) keep(3) nogenerate
merge m:1 insee_com1 using "$TEMP/recettes", assert(2 3) keep(3) nogenerate
merge m:1 insee_com1 using "$TEMP/subdelegations", assert(2 3) keep(3) nogenerate
merge m:1 insee_com1 using "$TEMP/bailliages", assert(2 3) keep(3) nogenerate
merge m:1 insee_com1 using "$TEMP/rebellions", assert(2 3) keep(3) nogenerate
merge m:1 insee_com1 using "$TEMP/roads", assert(2 3) keep(3) nogenerate
merge m:1 insee_com1 using "$TEMP/horse_post", assert(2 3) keep(3) nogenerate
merge m:1 insee_com1 using "$TEMP/clerical_peril", assert(2 3) keep(3) nogenerate
merge m:1 insee_com1 using "$TEMP/coordinates", assert(2 3) keep(3) nogenerate
merge m:1 insee_com using  "$TEMP/wheat_prices", assert(2 3) keep(3) nogenerate
merge m:1 insee_com1 using "$TEMP/data_treatments", assert(2 3) keep(3) nogenerate
merge m:1 insee_com1 using "$TEMP/bailliage", assert(2 3) keep(3) nogenerate
merge m:1 insee_com1 using "$TEMP/cassini_population", assert(2 3) keep(3) nogenerate
merge m:1 insee_com using  "$TEMP/rural_henry", assert(2 3) keep(3) nogenerate
merge m:1 insee_com using  "$TEMP/sandy", keep(1 3) nogenerate /* 11 missing */
merge m:1 segment_affected_50 using "$TEMP/sandy_buffer", assert(2 3) keep(1 3) nogenerate
merge m:1 insee_com using "$TEMP/ruggedness", assert(2 3) keep(3) nogenerate
merge m:1 segment_affected_50 using "$TEMP/ruggedness_buffer", assert(2 3) keep(3) nogenerate
merge m:1 insee_com using  "$TEMP/caloric", keep(1 3) nogenerate

/* deal with PLM */
replace insee_com = "75056" if insee_com == "75104" | insee_com == "75120" | ///
	insee_com == "75115" | insee_com == "75117"
replace insee_com1 = 75056 if insee_com1 == 75104 | insee_com1 == 75120 | ///
	insee_com1 == 75115 | insee_com1 == 75117
replace insee_com = "69123" if insee_com == "69383" | insee_com == "69389"
replace insee_com1 = 69123 if insee_com1 == 69383 | insee_com1 == 69389
replace insee_com = "13055" if insee_com == "13211"
replace insee_com1 = 13055 if insee_com1 == 13211

replace rural_henry = 0 if insee_com == "75056"
replace rural_henry = 0 if insee_com == "69123"
replace rural_henry = 0 if insee_com == "13055"

order nom_comm, after(insee_com)
label variable nom_comm "Commune name"
compress

save "$TEMP/geni_controls", replace

* ERASE TEMPORARY FILES

erase "$TEMP/coordinates.dta"
erase "$TEMP/bailliage.dta"
erase "$TEMP/wheat_prices.dta"
erase "$TEMP/rural_henry.dta"
erase "$TEMP/eveches.dta"
erase "$TEMP/recettes.dta"
erase "$TEMP/bailliages.dta"
erase "$TEMP/subdelegations.dta"
erase "$TEMP/political_societies.dta"
erase "$TEMP/rebellions.dta"
erase "$TEMP/roads.dta"
erase "$TEMP/horse_post.dta"
erase "$TEMP/clerical_peril.dta"
erase "$TEMP/sandy.dta"
erase "$TEMP/caloric.dta"
erase "$TEMP/ruggedness.dta"
erase "$TEMP/cassini_population.dta"
erase "$TEMP/ruggedness_buffer.dta"
erase "$TEMP/sandy_buffer.dta"
erase "$TEMP/segment_affected_100_length.dta"
erase "$TEMP/segment_affected_50_length.dta"
erase "$TEMP/data_treatments.dta"

timer off 1 /* 9.5 seconds */
timer list